diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -266,7 +266,10 @@ Value *get(VPValue *Def, unsigned Part); /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance); + Value *get(VPValue *Def, VPIteration Instance); + + Value *get(VPValue *Def, + const std::variant &LaneOrIter); bool hasVectorValue(VPValue *Def, unsigned Part) { auto I = Data.PerPartOutput.find(Def); @@ -289,13 +292,8 @@ } /// Set the generated Value for a given VPValue and a given Part. - void set(VPValue *Def, Value *V, unsigned Part) { - if (!Data.PerPartOutput.count(Def)) { - DataState::PerPartValuesTy Entry(UF); - Data.PerPartOutput[Def] = Entry; - } - Data.PerPartOutput[Def][Part] = V; - } + void set(VPValue *Def, Value *V, unsigned Part); + /// Reset an existing vector value for \p Def and a given \p Part. void reset(VPValue *Def, Value *V, unsigned Part) { auto Iter = Data.PerPartOutput.find(Def); @@ -305,18 +303,7 @@ } /// Set the generated scalar \p V for \p Def and the given \p Instance. - void set(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.insert({Def, {}}); - auto &PerPartVec = Iter.first->second; - while (PerPartVec.size() <= Instance.Part) - PerPartVec.emplace_back(); - auto &Scalars = PerPartVec[Instance.Part]; - unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - while (Scalars.size() <= CacheIdx) - Scalars.push_back(nullptr); - assert(!Scalars[CacheIdx] && "should overwrite existing value"); - Scalars[CacheIdx] = V; - } + void set(VPValue *Def, Value *V, const VPIteration &Instance); /// Reset an existing scalar value for \p Def and a given \p Instance. void reset(VPValue *Def, Value *V, const VPIteration &Instance) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -213,7 +213,43 @@ return It; } -Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { +void VPTransformState::set(VPValue *Def, Value *V, unsigned Part) { + if (!Data.PerPartOutput.count(Def)) { + DataState::PerPartValuesTy Entry(UF); + Data.PerPartOutput[Def] = Entry; + } + + if (Def->isDefinedOutsideVectorRegions()) { + for (unsigned Part = 0; Part != UF; Part++) + Data.PerPartOutput[Def][Part] = V; + } else + Data.PerPartOutput[Def][Part] = V; +} + +void VPTransformState::set(VPValue *Def, Value *V, + const VPIteration &Instance) { + auto Iter = Data.PerPartScalars.insert({Def, {}}); + + auto SetForPart = [this, Iter, V](VPLane Lane, unsigned Part) { + auto &PerPartVec = Iter.first->second; + while (PerPartVec.size() <= Part) + PerPartVec.emplace_back(); + auto &Scalars = PerPartVec[Part]; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + while (Scalars.size() <= CacheIdx) + Scalars.push_back(nullptr); + assert(!Scalars[CacheIdx] && "should overwrite existing value"); + Scalars[CacheIdx] = V; + }; + if (Def->isDefinedOutsideVectorRegions()) { + for (unsigned Part = 0; Part != UF; ++Part) { + SetForPart(Instance.Lane, Part); + } + } else + SetForPart(Instance.Lane, Instance.Part); +} + +Value *VPTransformState::get(VPValue *Def, VPIteration Instance) { if (Def->isLiveIn()) return Def->getLiveInIRValue(); @@ -234,6 +270,7 @@ // set(Def, Extract, Instance); return Extract; } + BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; @@ -1058,6 +1095,10 @@ } #endif +bool VPValue::isDefinedOutsideVectorRegions() const { + return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent(); +} + void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -391,8 +391,12 @@ assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(FMF); - for (unsigned Part = 0; Part < State.UF; ++Part) - generateInstruction(State, Part); + if (isDefinedOutsideVectorRegions()) { + generateInstruction(State, 0); + } else { + for (unsigned Part = 0; Part < State.UF; ++Part) + generateInstruction(State, Part); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1177,8 +1181,7 @@ assert(!State.ExpandedSCEVs.contains(Expr) && "Same SCEV expanded multiple times"); State.ExpandedSCEVs[Expr] = Res; - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, Res, {Part, 0}); + State.set(this, Res, {0, 0}); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -190,8 +190,7 @@ /// Returns true if the VPValue is defined outside any vector regions, i.e. it /// is a live-in value. - /// TODO: Also handle recipes defined in pre-header blocks. - bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); } + bool isDefinedOutsideVectorRegions() const; }; typedef DenseMap Value2VPValueTy; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -27,20 +27,8 @@ ; INTERLEAVE-4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0 ; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 ; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT7]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT9]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT11]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT13]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT15]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT17]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT19]], <16 x i8> poison, <16 x i32> zeroinitializer ; INTERLEAVE-4-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE-4: vector.body: ; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -61,17 +49,17 @@ ; INTERLEAVE-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 48 ; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 ; INTERLEAVE-4-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; INTERLEAVE-4-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT8]] -; INTERLEAVE-4-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD5]], [[BROADCAST_SPLAT10]] -; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT12]] -; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT14]]) -; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT16]]) -; INTERLEAVE-4-NEXT: [[TMP19:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT18]]) -; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD6]], <16 x i8> [[BROADCAST_SPLAT20]]) +; INTERLEAVE-4-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD5]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT8]]) +; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT8]]) +; INTERLEAVE-4-NEXT: [[TMP19:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT8]]) +; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD6]], <16 x i8> [[BROADCAST_SPLAT8]]) ; INTERLEAVE-4-NEXT: [[TMP21:%.*]] = select <16 x i1> [[TMP13]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP17]] -; INTERLEAVE-4-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP14]], <16 x i8> [[BROADCAST_SPLAT8]], <16 x i8> [[TMP18]] -; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP15]], <16 x i8> [[BROADCAST_SPLAT10]], <16 x i8> [[TMP19]] -; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = select <16 x i1> [[TMP16]], <16 x i8> [[BROADCAST_SPLAT12]], <16 x i8> [[TMP20]] +; INTERLEAVE-4-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP14]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP18]] +; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP15]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP19]] +; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = select <16 x i1> [[TMP16]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP20]] ; INTERLEAVE-4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP1]] ; INTERLEAVE-4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP2]] ; INTERLEAVE-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]] @@ -96,33 +84,33 @@ ; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; INTERLEAVE-4: vec.epilog.ph: ; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; INTERLEAVE-4-NEXT: [[N_MOD_VF21:%.*]] = urem i64 [[N]], 8 -; INTERLEAVE-4-NEXT: [[N_VEC22:%.*]] = sub i64 [[N]], [[N_MOD_VF21]] -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT26]], <8 x i8> poison, <8 x i32> zeroinitializer -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT28]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-4-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8 +; INTERLEAVE-4-NEXT: [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]] +; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT14]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-4-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +; INTERLEAVE-4-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT16]], <8 x i8> poison, <8 x i32> zeroinitializer ; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; INTERLEAVE-4: vec.epilog.vector.body: -; INTERLEAVE-4-NEXT: [[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[TMP34:%.*]] = add i64 [[INDEX24]], 0 +; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[TMP34:%.*]] = add i64 [[INDEX12]], 0 ; INTERLEAVE-4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP34]] ; INTERLEAVE-4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP36]], align 1 -; INTERLEAVE-4-NEXT: [[TMP37:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD25]], [[BROADCAST_SPLAT27]] -; INTERLEAVE-4-NEXT: [[TMP38:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD25]], <8 x i8> [[BROADCAST_SPLAT29]]) -; INTERLEAVE-4-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP37]], <8 x i8> [[BROADCAST_SPLAT27]], <8 x i8> [[TMP38]] +; INTERLEAVE-4-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP36]], align 1 +; INTERLEAVE-4-NEXT: [[TMP37:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD13]], [[BROADCAST_SPLAT15]] +; INTERLEAVE-4-NEXT: [[TMP38:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD13]], <8 x i8> [[BROADCAST_SPLAT17]]) +; INTERLEAVE-4-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP37]], <8 x i8> [[BROADCAST_SPLAT15]], <8 x i8> [[TMP38]] ; INTERLEAVE-4-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP34]] ; INTERLEAVE-4-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0 ; INTERLEAVE-4-NEXT: store <8 x i8> [[TMP39]], ptr [[TMP41]], align 1 -; INTERLEAVE-4-NEXT: [[INDEX_NEXT30]] = add nuw i64 [[INDEX24]], 8 -; INTERLEAVE-4-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT30]], [[N_VEC22]] +; INTERLEAVE-4-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX12]], 8 +; INTERLEAVE-4-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC10]] ; INTERLEAVE-4-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE-4: vec.epilog.middle.block: -; INTERLEAVE-4-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N]], [[N_VEC22]] -; INTERLEAVE-4-NEXT: br i1 [[CMP_N23]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-4-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]] +; INTERLEAVE-4-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-4: vec.epilog.scalar.ph: -; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: ; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -157,12 +145,8 @@ ; INTERLEAVE-2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0 ; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 +; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 ; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT7]], <16 x i8> poison, <16 x i32> zeroinitializer -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT9]], <16 x i8> poison, <16 x i32> zeroinitializer ; INTERLEAVE-2-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE-2: vector.body: ; INTERLEAVE-2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -175,11 +159,11 @@ ; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 ; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT6]] -; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT8]]) -; INTERLEAVE-2-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT10]]) +; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] +; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT6]]) +; INTERLEAVE-2-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT6]]) ; INTERLEAVE-2-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP9]] -; INTERLEAVE-2-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> [[BROADCAST_SPLAT6]], <16 x i8> [[TMP10]] +; INTERLEAVE-2-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP10]] ; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP1]] ; INTERLEAVE-2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP2]] ; INTERLEAVE-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 @@ -198,33 +182,33 @@ ; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; INTERLEAVE-2: vec.epilog.ph: ; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; INTERLEAVE-2-NEXT: [[N_MOD_VF11:%.*]] = urem i64 [[N]], 8 -; INTERLEAVE-2-NEXT: [[N_VEC12:%.*]] = sub i64 [[N]], [[N_MOD_VF11]] -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT16]], <8 x i8> poison, <8 x i32> zeroinitializer -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 -; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT18]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-2-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[N]], 8 +; INTERLEAVE-2-NEXT: [[N_VEC8:%.*]] = sub i64 [[N]], [[N_MOD_VF7]] +; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT12]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT14]], <8 x i8> poison, <8 x i32> zeroinitializer ; INTERLEAVE-2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; INTERLEAVE-2: vec.epilog.vector.body: -; INTERLEAVE-2-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; INTERLEAVE-2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX14]], 0 +; INTERLEAVE-2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX10]], 0 ; INTERLEAVE-2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP18]] ; INTERLEAVE-2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0 -; INTERLEAVE-2-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i8>, ptr [[TMP20]], align 1 -; INTERLEAVE-2-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD15]], [[BROADCAST_SPLAT17]] -; INTERLEAVE-2-NEXT: [[TMP22:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD15]], <8 x i8> [[BROADCAST_SPLAT19]]) -; INTERLEAVE-2-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> [[BROADCAST_SPLAT17]], <8 x i8> [[TMP22]] +; INTERLEAVE-2-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP20]], align 1 +; INTERLEAVE-2-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]] +; INTERLEAVE-2-NEXT: [[TMP22:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD11]], <8 x i8> [[BROADCAST_SPLAT15]]) +; INTERLEAVE-2-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> [[BROADCAST_SPLAT13]], <8 x i8> [[TMP22]] ; INTERLEAVE-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP18]] ; INTERLEAVE-2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0 ; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP25]], align 1 -; INTERLEAVE-2-NEXT: [[INDEX_NEXT20]] = add nuw i64 [[INDEX14]], 8 -; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC12]] +; INTERLEAVE-2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX10]], 8 +; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC8]] ; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE-2: vec.epilog.middle.block: -; INTERLEAVE-2-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[N]], [[N_VEC12]] -; INTERLEAVE-2-NEXT: br i1 [[CMP_N13]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-2-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC8]] +; INTERLEAVE-2-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-2: vec.epilog.scalar.ph: -; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; INTERLEAVE-2-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-2: loop: ; INTERLEAVE-2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -31,8 +31,6 @@ ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -44,7 +42,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8 ; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2 @@ -118,8 +116,6 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -129,7 +125,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 ; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2 @@ -140,45 +136,45 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST13:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END14:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST13]] +; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST11:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END12:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST11]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC11]] to i32 -; CHECK-NEXT: [[IND_END12:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC11]] -; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC11]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT27]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC9]] to i32 +; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC9]] +; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP24:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX23]] -; CHECK-NEXT: [[NEXT_GEP25:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX23]] -; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <8 x i8>, ptr [[NEXT_GEP24]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD26]], <8 x i8> [[BROADCAST_SPLAT28]]) -; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP25]], align 2 -; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX23]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC11]] +; CHECK-NEXT: [[INDEX21:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX21]] +; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX21]] +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, ptr [[NEXT_GEP22]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]]) +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP23]], align 2 +; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX21]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC9]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[N_VEC11]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N22]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[N_VEC9]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i32 [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi ptr [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi ptr [ [[IND_END19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END20]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi ptr [ [[IND_END17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL21]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP8]], i8 [[OFFSET]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -397,127 +397,100 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT8:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT9:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], [[TMP24]] +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP61]]) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP63]]) -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = add i64 [[INDEX]], [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = add i64 [[INDEX]], [[TMP76]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP74]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP77]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP79]] -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = extractelement [[TMP80]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP40]]) +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = select [[ACTIVE_LANE_MASK1]], [[WIDE_MASKED_LOAD4]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP41]], [[TMP42]]) +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = select [[ACTIVE_LANE_MASK2]], [[WIDE_MASKED_LOAD5]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP43]], [[TMP44]]) +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = select [[ACTIVE_LANE_MASK3]], [[WIDE_MASKED_LOAD6]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP47]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], [[TMP46]]) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], [[TMP49]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], [[TMP52]] +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], [[TMP55]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP50]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT8]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT9]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP56]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP58]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = extractelement [[TMP59]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP63]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP85]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP64]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -1716,151 +1689,124 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], [[TMP24]] +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP44]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP49]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP50]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP52]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD7]] +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = fmul [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD8]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = fmul [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD9]] +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = fmul [[WIDE_MASKED_LOAD6]], [[WIDE_MASKED_LOAD10]] +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP54]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP58]]) +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = select [[ACTIVE_LANE_MASK1]], [[TMP55]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP59]], [[TMP60]]) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP56]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], [[TMP62]]) +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP57]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP65]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP63]], [[TMP64]]) ; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = add i64 [[INDEX]], [[TMP67]] ; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]] ; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]] -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]] -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = add i64 [[INDEX]], [[TMP73]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP68]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP74]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP76]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = extractelement [[TMP77]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP65]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP82]], float [[TMP83]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP65]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; @@ -2130,151 +2076,124 @@ ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]] -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]] -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], [[TMP24]] +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK2]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP25]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP44]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP49]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP50]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP52]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD7]] +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = fmul nnan [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD8]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = fmul nnan [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD9]] +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = fmul nnan [[WIDE_MASKED_LOAD6]], [[WIDE_MASKED_LOAD10]] +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP54]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP58]]) +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = select nnan [[ACTIVE_LANE_MASK1]], [[TMP55]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP59]], [[TMP60]]) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = select nnan [[ACTIVE_LANE_MASK2]], [[TMP56]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], [[TMP62]]) +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = select nnan [[ACTIVE_LANE_MASK3]], [[TMP57]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP65]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP63]], [[TMP64]]) ; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = add i64 [[INDEX]], [[TMP67]] ; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]] ; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]] -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]]) -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32 -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]] -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = add i64 [[INDEX]], [[TMP73]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP68]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP74]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 32 +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP76]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = extractelement [[TMP77]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP65]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP82]], float [[TMP83]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP65]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -19,109 +19,76 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 -; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 -; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector [[BROADCAST_SPLATINSERT12]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector [[BROADCAST_SPLATINSERT14]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 12 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP26]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP32]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK2]]) +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP35]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK3]]) ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT11]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT13]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT15]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 4 -; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[INDEX6]], [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 8 -; CHECK-NEXT: [[TMP66:%.*]] = add i64 [[INDEX6]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 12 -; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX6]], [[TMP68]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP63]], i64 [[TMP20]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP66]], i64 [[TMP25]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP69]], i64 [[TMP30]]) -; CHECK-NEXT: [[TMP70:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP70]], 16 -; CHECK-NEXT: [[INDEX_NEXT19]] = add i64 [[INDEX6]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 -; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 12 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP38]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK4]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 4 +; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX1]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP44:%.*]] = mul i64 [[TMP43]], 8 +; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[INDEX1]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 12 +; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX1]], [[TMP47]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP42]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP45]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP48]], i64 [[TMP9]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16 +; CHECK-NEXT: [[INDEX_NEXT8]] = add i64 [[INDEX1]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP52:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP53:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP54:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP55:%.*]] = extractelement [[TMP51]], i32 0 +; CHECK-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -167,135 +134,102 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16 -; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 -; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector [[BROADCAST_SPLATINSERT13]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector [[BROADCAST_SPLATINSERT15]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement poison, i32 [[VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector [[BROADCAST_SPLATINSERT17]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 12 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP26]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP32]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP35]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK3]], poison) ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]], poison) -; CHECK-NEXT: [[TMP61:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP62:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer -; CHECK-NEXT: [[TMP63:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer -; CHECK-NEXT: [[TMP64:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP61]], zeroinitializer -; CHECK-NEXT: [[TMP70:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP62]], zeroinitializer -; CHECK-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP63]], zeroinitializer -; CHECK-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], zeroinitializer -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP69]]) -; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4 -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT14]], ptr [[TMP76]], i32 4, [[TMP70]]) -; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 8 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT16]], ptr [[TMP79]], i32 4, [[TMP71]]) -; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT18]], ptr [[TMP82]], i32 4, [[TMP72]]) -; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 4 -; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX6]], [[TMP84]] -; CHECK-NEXT: [[TMP86:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP87:%.*]] = mul i64 [[TMP86]], 8 -; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX6]], [[TMP87]] -; CHECK-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 12 -; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[INDEX6]], [[TMP90]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP85]], i64 [[TMP20]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP88]], i64 [[TMP25]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP91]], i64 [[TMP30]]) -; CHECK-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-NEXT: [[INDEX_NEXT22]] = add i64 [[INDEX6]], [[TMP93]] -; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 12 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP38]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK4]], poison) +; CHECK-NEXT: [[TMP40:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne [[WIDE_MASKED_LOAD6]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = icmp ne [[WIDE_MASKED_LOAD7]], zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP48:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP49:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP41]], zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP42]], zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = select [[ACTIVE_LANE_MASK4]], [[TMP43]], zeroinitializer +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[TMP44]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP52]], i32 4, [[TMP48]]) +; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 4 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP54]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP55]], i32 4, [[TMP49]]) +; CHECK-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 8 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP57]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP58]], i32 4, [[TMP50]]) +; CHECK-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 12 +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP60]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP61]], i32 4, [[TMP51]]) +; CHECK-NEXT: [[TMP62:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP63:%.*]] = mul i64 [[TMP62]], 4 +; CHECK-NEXT: [[TMP64:%.*]] = add i64 [[INDEX1]], [[TMP63]] +; CHECK-NEXT: [[TMP65:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP65]], 8 +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[INDEX1]], [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], 12 +; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[INDEX1]], [[TMP69]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT8]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP64]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT9]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP67]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP70]], i64 [[TMP9]]) +; CHECK-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 16 +; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX1]], [[TMP72]] +; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP76:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP77:%.*]] = extractelement [[TMP73]], i32 0 +; CHECK-NEXT: br i1 [[TMP77]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -62,8 +62,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] @@ -741,8 +741,6 @@ ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 39968 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -754,7 +752,7 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT7]] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4 @@ -808,12 +806,6 @@ ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 39936 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -829,9 +821,9 @@ ; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT11]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT13]] -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4 @@ -881,10 +873,10 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias nocapture %z) { ; CHECK-LABEL: @mult_ptr_iv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[Z:%.*]], i32 3000 -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 3000 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[Z]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[X]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Z:%.*]], i32 3000 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 3000 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[Z]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[X]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -152,12 +152,6 @@ ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT7]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT9]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT11]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -172,9 +166,9 @@ ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 48 ; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT8]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[STEP_ADD4]], [[BROADCAST_SPLAT10]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i16> [[STEP_ADD5]], [[BROADCAST_SPLAT12]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[STEP_ADD4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i16> [[STEP_ADD5]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP5]] @@ -195,52 +189,52 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[L]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[DOTCAST18:%.*]] = trunc i64 [[N_VEC]] to i16 -; CHECK-NEXT: [[IND_END19:%.*]] = mul i16 [[DOTCAST18]], [[TMP0]] +; CHECK-NEXT: [[DOTCAST12:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[IND_END13:%.*]] = mul i16 [[DOTCAST12]], [[TMP0]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[L]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF13:%.*]] = urem i64 [[L]], 8 -; CHECK-NEXT: [[N_VEC14:%.*]] = sub i64 [[L]], [[N_MOD_VF13]] -; CHECK-NEXT: [[DOTCAST16:%.*]] = trunc i64 [[N_VEC14]] to i16 -; CHECK-NEXT: [[IND_END17:%.*]] = mul i16 [[DOTCAST16]], [[TMP0]] -; CHECK-NEXT: [[DOTSPLATINSERT23:%.*]] = insertelement <8 x i16> poison, i16 [[BC_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT24:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT23]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT25:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT26:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT25]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i16> , [[DOTSPLAT26]] -; CHECK-NEXT: [[INDUCTION27:%.*]] = add <8 x i16> [[DOTSPLAT24]], [[TMP20]] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[L]], 8 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[L]], [[N_MOD_VF7]] +; CHECK-NEXT: [[DOTCAST10:%.*]] = trunc i64 [[N_VEC8]] to i16 +; CHECK-NEXT: [[IND_END11:%.*]] = mul i16 [[DOTCAST10]], [[TMP0]] +; CHECK-NEXT: [[DOTSPLATINSERT17:%.*]] = insertelement <8 x i16> poison, i16 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT18:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT17]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT19]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i16> , [[DOTSPLAT20]] +; CHECK-NEXT: [[INDUCTION21:%.*]] = add <8 x i16> [[DOTSPLAT18]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = mul i16 [[TMP0]], 8 -; CHECK-NEXT: [[DOTSPLATINSERT28:%.*]] = insertelement <8 x i16> poison, i16 [[TMP21]], i64 0 -; CHECK-NEXT: [[DOTSPLAT29:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT28]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT33]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT22:%.*]] = insertelement <8 x i16> poison, i16 [[TMP21]], i64 0 +; CHECK-NEXT: [[DOTSPLAT23:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT22]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT27]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX22:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND30:%.*]] = phi <8 x i16> [ [[INDUCTION27]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT32:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = sub <8 x i16> [[VEC_IND30]], [[BROADCAST_SPLAT34]] +; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND24:%.*]] = phi <8 x i16> [ [[INDUCTION21]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX16]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = sub <8 x i16> [[VEC_IND24]], [[BROADCAST_SPLAT28]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0 ; CHECK-NEXT: store <8 x i16> [[TMP23]], ptr [[TMP25]], align 2 -; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX22]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT32]] = add <8 x i16> [[VEC_IND30]], [[DOTSPLAT29]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC14]] +; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT26]] = add <8 x i16> [[VEC_IND24]], [[DOTSPLAT23]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N21:%.*]] = icmp eq i64 [[L]], [[N_VEC14]] -; CHECK-NEXT: br i1 [[CMP_N21]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[L]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i16 [ [[IND_END17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i16 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[P_09:%.*]] = phi i16 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[P_09:%.*]] = phi i16 [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ADD]] = sub i16 [[P_09]], [[OFF]] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[IV]] ; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX3]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -29,12 +29,6 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967232 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT12]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT14]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT16]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -50,9 +44,9 @@ ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48 ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT13]]) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT15]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT17]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16 ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 @@ -68,50 +62,50 @@ ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END30:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP14]] +; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP15]] -; CHECK-NEXT: [[DOTCAST23:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END24:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST23]] +; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP15]] +; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC21]] to i32 -; CHECK-NEXT: [[IND_END22:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 -; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 -; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP17]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT37]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC15]] to i32 +; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 +; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 +; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP17]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT31]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX33:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[INDEX33]], 1 -; CHECK-NEXT: [[NEXT_GEP34:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[INDEX33]], 1 -; CHECK-NEXT: [[NEXT_GEP35:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] -; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <8 x i16>, ptr [[NEXT_GEP34]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD36]], <8 x i16> [[BROADCAST_SPLAT38]]) -; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[NEXT_GEP35]], align 2 -; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX33]], 8 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] +; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[INDEX27]], 1 +; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[INDEX27]], 1 +; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <8 x i16>, ptr [[NEXT_GEP28]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD30]], <8 x i16> [[BROADCAST_SPLAT32]]) +; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[NEXT_GEP29]], align 2 +; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] ; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[N_VEC21]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N32]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i32 [ [[IND_END22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL28:%.*]] = phi ptr [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL31:%.*]] = phi ptr [ [[IND_END29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END30]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL28]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL31]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) @@ -168,12 +162,6 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967168 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT12]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT14]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT16]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -187,9 +175,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT13]]) -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT15]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT17]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 ; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 @@ -204,45 +192,45 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END30:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST23:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END24:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST23]] +; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 112 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP0]], 4294967280 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC21]] to i32 -; CHECK-NEXT: [[IND_END22:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC21]] -; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC21]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT37]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967280 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC15]] to i32 +; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC15]] +; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC15]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT31]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX33:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP34:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX33]] -; CHECK-NEXT: [[NEXT_GEP35:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX33]] -; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <16 x i8>, ptr [[NEXT_GEP34]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD36]], <16 x i8> [[WIDE_LOAD36]], <16 x i8> [[BROADCAST_SPLAT38]]) -; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[NEXT_GEP35]], align 2 -; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX33]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] +; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX27]] +; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX27]] +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <16 x i8>, ptr [[NEXT_GEP28]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD30]], <16 x i8> [[WIDE_LOAD30]], <16 x i8> [[BROADCAST_SPLAT32]]) +; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[NEXT_GEP29]], align 2 +; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[N_VEC21]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N32]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i32 [ [[IND_END22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL28:%.*]] = phi ptr [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL31:%.*]] = phi ptr [ [[IND_END29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END30]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL28]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL31]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP14]], i8 [[TMP14]], i8 [[OFFSET]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -23,12 +23,6 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[LEN:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[LEN]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[LEN]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i64> poison, i64 [[LEN]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -45,9 +39,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT8]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT10]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT12]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]] @@ -55,32 +49,32 @@ ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], ; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], ; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i1> [[TMP6]], ; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP7]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD14]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD15]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP21]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]] -; CHECK-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]] -; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI18]] +; CHECK-NEXT: [[TMP21]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] +; CHECK-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] +; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[TMP20]] -; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP22]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX19]] -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) +; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP22]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX13]] +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -44,20 +44,8 @@ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT7]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT9]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT11]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT15]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -70,17 +58,17 @@ ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> ), !tbaa [[TBAA10:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT10]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT12]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT14]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT16]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT18]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT20]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -497,8 +497,6 @@ ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 3 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[CONV1]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x double> poison, double [[CONV1]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT2]], <4 x double> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -519,7 +517,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP9]] to <4 x double> ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = sitofp <4 x i16> [[TMP10]] to <4 x double> ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[TMP13]], [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = fmul fast <4 x double> [[TMP14]], [[BROADCAST_SPLAT3]] +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = fmul fast <4 x double> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = fsub fast <4 x double> [[TMP11]], [[TMP15]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = fsub fast <4 x double> [[TMP12]], [[TMP16]] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP3]] @@ -1393,8 +1391,6 @@ ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1402,7 +1398,7 @@ ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[TMP1]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT3]] +; UNROLL-NO-IC-NEXT: [[TMP1]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -852,8 +852,6 @@ ; VEC4_INTERL2-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT14]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -865,7 +863,7 @@ ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 ; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT15]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -405,8 +405,6 @@ ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -421,7 +419,7 @@ ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2 ; UNROLL-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope !7 ; UNROLL-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] -; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] ; UNROLL-NEXT: [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP11]] ; UNROLL-NEXT: [[TMP14:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP12]] ; UNROLL-NEXT: store <2 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 @@ -475,8 +473,6 @@ ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -499,7 +495,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 2 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope !7 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP19]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP20]] ; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP21]], ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7 @@ -552,8 +548,6 @@ ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -568,7 +562,7 @@ ; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4 ; INTERLEAVE-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope !7 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] -; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] ; INTERLEAVE-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP11]] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[TMP12]] ; INTERLEAVE-NEXT: store <4 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 @@ -1355,8 +1349,6 @@ ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[Y]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1376,7 +1368,7 @@ ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 ; UNROLL-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]] -; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT2]] +; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0 ; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1 @@ -1416,8 +1408,6 @@ ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[Y]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1438,7 +1428,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT2]] +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 ; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 @@ -1480,8 +1470,6 @@ ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1505,7 +1493,7 @@ ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]] +; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 @@ -2120,12 +2108,12 @@ ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 +; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_UDIV_CONTINUE10]] ] -; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[PRED_UDIV_CONTINUE8]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[PRED_UDIV_CONTINUE8]] ] ; UNROLL-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] @@ -2149,48 +2137,44 @@ ; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL: pred.udiv.continue4: ; UNROLL-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF3]] ] -; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; UNROLL: pred.udiv.if7: +; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; UNROLL: pred.udiv.if5: ; UNROLL-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP0]] ; UNROLL-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i64 0 -; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; UNROLL: pred.udiv.continue8: -; UNROLL-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], [[PRED_UDIV_IF7]] ] -; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]] -; UNROLL: pred.udiv.if9: +; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; UNROLL: pred.udiv.continue6: +; UNROLL-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], [[PRED_UDIV_IF5]] ] +; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]] +; UNROLL: pred.udiv.if7: ; UNROLL-NEXT: [[TMP17:%.*]] = or i32 [[INDEX]], 3 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1 ; UNROLL-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP17]] ; UNROLL-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP19]], i64 1 -; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; UNROLL: pred.udiv.continue10: -; UNROLL-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP20]], [[PRED_UDIV_IF9]] ] -; UNROLL-NEXT: [[TMP22:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], -; UNROLL-NEXT: [[TMP23:%.*]] = shufflevector <2 x i1> [[TMP22]], <2 x i1> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], -; UNROLL-NEXT: [[TMP25:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP23]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] -; UNROLL-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]] -; UNROLL-NEXT: [[TMP26]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] -; UNROLL-NEXT: [[TMP27]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]] +; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; UNROLL: pred.udiv.continue8: +; UNROLL-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ] +; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP12]], <2 x i32> [[WIDE_LOAD]] +; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]] +; UNROLL-NEXT: [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] +; UNROLL-NEXT: [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; UNROLL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP27]], [[TMP26]] -; UNROLL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP23]], [[TMP22]] +; UNROLL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP25]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; UNROLL-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; UNROLL-NEXT: [[TMP30:%.*]] = zext i32 [[I]] to i64 -; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]] +; UNROLL-NEXT: [[TMP26:%.*]] = zext i32 [[I]] to i64 +; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] ; UNROLL-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; UNROLL-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; UNROLL: if.then: @@ -2203,7 +2187,7 @@ ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; UNROLL: for.end: -; UNROLL-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret i32 [[VAR5]] ; ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_05( @@ -2216,13 +2200,11 @@ ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_UDIV_CONTINUE10]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_UDIV_CONTINUE8]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_UDIV_CONTINUE8]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] @@ -2250,31 +2232,31 @@ ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-IC: pred.udiv.continue4: ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; UNROLL-NO-IC: pred.udiv.if7: +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; UNROLL-NO-IC: pred.udiv.if5: ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; UNROLL-NO-IC: pred.udiv.continue8: -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ] -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]] -; UNROLL-NO-IC: pred.udiv.if9: +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; UNROLL-NO-IC: pred.udiv.continue6: +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF5]] ] +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-IC: pred.udiv.if7: ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = add i32 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 [[TMP24]], [[TMP23]] ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP25]], i32 1 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; UNROLL-NO-IC: pred.udiv.continue10: -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP26]], [[PRED_UDIV_IF9]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-IC: pred.udiv.continue8: +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP26]], [[PRED_UDIV_IF7]] ] ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], +; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], ; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP16]] -; UNROLL-NO-IC-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP29]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP27]] +; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP29]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP27]] ; UNROLL-NO-IC-NEXT: [[TMP30]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP31]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]] +; UNROLL-NO-IC-NEXT: [[TMP31]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] @@ -2314,12 +2296,12 @@ ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: -; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ] -; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ] -; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ] +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ] +; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_UDIV_CONTINUE16]] ] +; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE16]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] @@ -2361,66 +2343,62 @@ ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; INTERLEAVE: pred.udiv.continue8: ; INTERLEAVE-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] -; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] -; INTERLEAVE: pred.udiv.if11: +; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; INTERLEAVE: pred.udiv.if9: ; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = udiv i32 [[TMP23]], [[TMP0]] ; INTERLEAVE-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP24]], i64 0 -; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE12]] -; INTERLEAVE: pred.udiv.continue12: -; INTERLEAVE-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP25]], [[PRED_UDIV_IF11]] ] -; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] -; INTERLEAVE: pred.udiv.if13: +; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; INTERLEAVE: pred.udiv.continue10: +; INTERLEAVE-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP25]], [[PRED_UDIV_IF9]] ] +; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; INTERLEAVE: pred.udiv.if11: ; INTERLEAVE-NEXT: [[TMP27:%.*]] = or i32 [[INDEX]], 5 ; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1 ; INTERLEAVE-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[TMP27]] ; INTERLEAVE-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP29]], i64 1 -; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE14]] -; INTERLEAVE: pred.udiv.continue14: -; INTERLEAVE-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP30]], [[PRED_UDIV_IF13]] ] -; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] -; INTERLEAVE: pred.udiv.if15: +; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; INTERLEAVE: pred.udiv.continue12: +; INTERLEAVE-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP30]], [[PRED_UDIV_IF11]] ] +; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; INTERLEAVE: pred.udiv.if13: ; INTERLEAVE-NEXT: [[TMP32:%.*]] = or i32 [[INDEX]], 6 ; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2 ; INTERLEAVE-NEXT: [[TMP34:%.*]] = udiv i32 [[TMP33]], [[TMP32]] ; INTERLEAVE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP34]], i64 2 -; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]] -; INTERLEAVE: pred.udiv.continue16: -; INTERLEAVE-NEXT: [[TMP36:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP35]], [[PRED_UDIV_IF15]] ] -; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]] -; INTERLEAVE: pred.udiv.if17: +; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; INTERLEAVE: pred.udiv.continue14: +; INTERLEAVE-NEXT: [[TMP36:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP35]], [[PRED_UDIV_IF13]] ] +; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16]] +; INTERLEAVE: pred.udiv.if15: ; INTERLEAVE-NEXT: [[TMP37:%.*]] = or i32 [[INDEX]], 7 ; INTERLEAVE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3 ; INTERLEAVE-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP38]], [[TMP37]] ; INTERLEAVE-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP39]], i64 3 -; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE18]] -; INTERLEAVE: pred.udiv.continue18: -; INTERLEAVE-NEXT: [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP40]], [[PRED_UDIV_IF17]] ] -; INTERLEAVE-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], -; INTERLEAVE-NEXT: [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP42]], <4 x i1> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], -; INTERLEAVE-NEXT: [[TMP45:%.*]] = shufflevector <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP43]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP22]] -; INTERLEAVE-NEXT: [[PREDPHI19:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]] -; INTERLEAVE-NEXT: [[TMP46]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]] -; INTERLEAVE-NEXT: [[TMP47]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]] +; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; INTERLEAVE: pred.udiv.continue16: +; INTERLEAVE-NEXT: [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ] +; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP22]], <4 x i32> [[WIDE_LOAD]] +; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]] +; INTERLEAVE-NEXT: [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]] +; INTERLEAVE-NEXT: [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]] -; INTERLEAVE-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP43]], [[TMP42]] +; INTERLEAVE-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP45]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; INTERLEAVE-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; INTERLEAVE-NEXT: [[TMP50:%.*]] = zext i32 [[I]] to i64 -; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] +; INTERLEAVE-NEXT: [[TMP46:%.*]] = zext i32 [[I]] to i64 +; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP46]] ; INTERLEAVE-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; INTERLEAVE-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; INTERLEAVE: if.then: @@ -2433,7 +2411,7 @@ ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; INTERLEAVE: for.end: -; INTERLEAVE-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: ret i32 [[VAR5]] ; entry: @@ -2581,36 +2559,34 @@ ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3 ; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLATINSERT2]], -; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND]] -; UNROLL-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> -; UNROLL-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i16> -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 -; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 -; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0 +; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] +; UNROLL-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> +; UNROLL-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> +; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 +; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 +; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 +; UNROLL-NEXT: store i16 [[TMP14]], ptr [[TMP10]], align 2 +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 +; UNROLL-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 ; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 -; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1 +; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 -; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP11]], i64 0 -; UNROLL-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 -; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP11]], i64 1 -; UNROLL-NEXT: store i16 [[TMP19]], ptr [[TMP15]], align 2 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; UNROLL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2619,14 +2595,14 @@ ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP21:%.*]] = trunc i64 [[I]] to i32 -; UNROLL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[A]] -; UNROLL-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 -; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 -; UNROLL-NEXT: store i16 [[TMP23]], ptr [[TMP24]], align 2 +; UNROLL-NEXT: [[TMP19:%.*]] = trunc i64 [[I]] to i32 +; UNROLL-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[A]] +; UNROLL-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; UNROLL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NEXT: store i16 [[TMP21]], ptr [[TMP22]], align 2 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; UNROLL-NEXT: [[TMP25:%.*]] = trunc i64 [[I_NEXT]] to i32 -; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP25]], [[N]] +; UNROLL-NEXT: [[TMP23:%.*]] = trunc i64 [[I_NEXT]] to i32 +; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP23]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void @@ -2643,8 +2619,6 @@ ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -2655,7 +2629,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT3]], [[STEP_ADD]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16> ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 @@ -2705,11 +2679,11 @@ ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3 @@ -2718,39 +2692,37 @@ ; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 6 ; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 7 ; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLATINSERT2]], -; INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND]] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> -; INTERLEAVE-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16> -; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 -; INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 -; INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 -; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 -; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP14]], i64 0 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> +; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16> +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 +; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1 +; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 +; INTERLEAVE-NEXT: store i16 [[TMP22]], ptr [[TMP14]], align 2 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 +; INTERLEAVE-NEXT: store i16 [[TMP23]], ptr [[TMP15]], align 2 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP14]], i64 1 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP14]], i64 2 +; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP14]], i64 3 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP15]], i64 0 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP15]], i64 1 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2 -; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i16> [[TMP15]], i64 2 -; INTERLEAVE-NEXT: store i16 [[TMP30]], ptr [[TMP22]], align 2 -; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <4 x i16> [[TMP15]], i64 3 -; INTERLEAVE-NEXT: store i16 [[TMP31]], ptr [[TMP23]], align 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2759,14 +2731,14 @@ ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP33:%.*]] = trunc i64 [[I]] to i32 -; INTERLEAVE-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], [[A]] -; INTERLEAVE-NEXT: [[TMP35:%.*]] = trunc i32 [[TMP34]] to i16 -; INTERLEAVE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 -; INTERLEAVE-NEXT: store i16 [[TMP35]], ptr [[TMP36]], align 2 +; INTERLEAVE-NEXT: [[TMP31:%.*]] = trunc i64 [[I]] to i32 +; INTERLEAVE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], [[A]] +; INTERLEAVE-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; INTERLEAVE-NEXT: store i16 [[TMP33]], ptr [[TMP34]], align 2 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; INTERLEAVE-NEXT: [[TMP37:%.*]] = trunc i64 [[I_NEXT]] to i32 -; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP37]], [[N]] +; INTERLEAVE-NEXT: [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32 +; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; INTERLEAVE: for.end: ; INTERLEAVE-NEXT: ret void @@ -3328,7 +3300,7 @@ ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; UNROLL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 +; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -3336,15 +3308,13 @@ ; UNROLL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[TMP6:%.*]] = and <2 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] -; UNROLL-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP7]], [[TMP4]] -; UNROLL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[TMP6:%.*]] = and <2 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; UNROLL-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP6]]) ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] ; UNROLL-NEXT: br label [[COND_END_I:%.*]] ; UNROLL: cond.end.i: ; UNROLL-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] @@ -3353,7 +3323,7 @@ ; UNROLL-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; UNROLL-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL: loopexit: -; UNROLL-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret i32 [[AND_I_LCSSA]] ; ; UNROLL-NO-IC-LABEL: @testoverflowcheck( @@ -3374,15 +3344,13 @@ ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP5]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP6]] = and <2 x i32> [[BROADCAST_SPLAT3]], [[VEC_PHI1]] +; UNROLL-NO-IC-NEXT: [[TMP6]] = and <2 x i32> [[BROADCAST_SPLAT]], [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] @@ -3422,7 +3390,7 @@ ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -3430,15 +3398,13 @@ ; INTERLEAVE-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[TMP6:%.*]] = and <4 x i32> [[BROADCAST_SPLATINSERT2]], [[BROADCAST_SPLATINSERT]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP7]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPR_I]], [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[C_PROMOTED_I]], [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[COND_END_I:%.*]] ; INTERLEAVE: cond.end.i: ; INTERLEAVE-NEXT: [[INC4_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] @@ -3447,7 +3413,7 @@ ; INTERLEAVE-NEXT: [[TOBOOL_I:%.*]] = icmp eq i8 [[INC_I]], 0 ; INTERLEAVE-NEXT: br i1 [[TOBOOL_I]], label [[LOOPEXIT]], label [[COND_END_I]], !llvm.loop [[LOOP35:![0-9]+]] ; INTERLEAVE: loopexit: -; INTERLEAVE-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[AND_I_LCSSA:%.*]] = phi i32 [ [[AND_I]], [[COND_END_I]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: ret i32 [[AND_I_LCSSA]] ; entry: @@ -6093,10 +6059,10 @@ ; UNROLL-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> ; UNROLL-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP0]] -; UNROLL-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP1]] +; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 +; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP0]] +; UNROLL-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP1]] ; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; UNROLL-NEXT: [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32 ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] @@ -6134,10 +6100,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP3]] -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP4]] +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0 +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP3]] +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP6]] @@ -6189,10 +6155,10 @@ ; INTERLEAVE-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP0]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP1]] +; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT]], [[TMP0]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT]], [[TMP1]] ; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; INTERLEAVE-NEXT: [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32 ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] @@ -6222,8 +6188,8 @@ ; INTERLEAVE-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32 -; INTERLEAVE-NEXT: [[SEXT5:%.*]] = shl i64 [[IV]], 32 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = ashr exact i64 [[SEXT5]], 32 +; INTERLEAVE-NEXT: [[SEXT3:%.*]] = shl i64 [[IV]], 32 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = ashr exact i64 [[SEXT3]], 32 ; INTERLEAVE-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP11]] ; INTERLEAVE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[IV_TRUNC]] ; INTERLEAVE-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -11,38 +11,38 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) -; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) -; CHECK-NEXT: [[TMP13]] = add i32 [[TMP12]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) -; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP9]] -; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP13]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP15]], [[BIN_RDX7]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP9]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP11]], [[BIN_RDX7]] ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -304,17 +304,15 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE44:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE44]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 4, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[PRED_LOAD_CONTINUE44]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP116:%.*]], [[PRED_LOAD_CONTINUE44]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[PRED_LOAD_CONTINUE44]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE38:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE38]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 4, [[VECTOR_PH]] ], [ [[TMP109:%.*]], [[PRED_LOAD_CONTINUE38]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[PRED_LOAD_CONTINUE38]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[PRED_LOAD_CONTINUE38]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP118:%.*]], [[PRED_LOAD_CONTINUE38]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[VEC_IND]], @@ -322,197 +320,193 @@ ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT9]], -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT11]], -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT13]], -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i64 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[TMP14]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP12]], i64 1 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP20]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK: pred.load.continue10: +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP16]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP26]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK: pred.load.continue12: +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP27]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3 +; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP32]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.continue14: +; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP33]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: -; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP38]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF15]] ] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP12]], i64 2 -; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-NEXT: [[TMP40:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP39]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1 +; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: -; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP30]], i64 2 +; CHECK-NEXT: [[TMP42:%.*]] = or i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP44]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: -; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP31]], [[PRED_LOAD_IF17]] ] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP12]], i64 3 -; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-NEXT: [[TMP46:%.*]] = phi <4 x i32> [ [[TMP40]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP45]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2 +; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: -; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 3 +; CHECK-NEXT: [[TMP48:%.*]] = or i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP50]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP37]], [[PRED_LOAD_IF19]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP13]], i64 0 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP46]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP51]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3 +; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> poison, i32 [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP56]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: -; CHECK-NEXT: [[TMP44:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP43]], [[PRED_LOAD_IF21]] ] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP13]], i64 1 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP52]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP57]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: -; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP60:%.*]] = or i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP61]], align 4 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> poison, i32 [[TMP62]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: -; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP49]], [[PRED_LOAD_IF23]] ] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP13]], i64 2 -; CHECK-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-NEXT: [[TMP64:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE22]] ], [ [[TMP63]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1 +; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: -; CHECK-NEXT: [[TMP52:%.*]] = or i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP52]] -; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP54]], i64 2 +; CHECK-NEXT: [[TMP66:%.*]] = or i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP68]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: -; CHECK-NEXT: [[TMP56:%.*]] = phi <4 x i32> [ [[TMP50]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP55]], [[PRED_LOAD_IF25]] ] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP13]], i64 3 -; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ [[TMP64]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP69]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2 +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: -; CHECK-NEXT: [[TMP58:%.*]] = or i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 3 +; CHECK-NEXT: [[TMP72:%.*]] = or i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP74]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: -; CHECK-NEXT: [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP61]], [[PRED_LOAD_IF27]] ] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP14]], i64 0 -; CHECK-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: [[TMP76:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP75]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3 +; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] ; CHECK: pred.load.if29: -; CHECK-NEXT: [[TMP64:%.*]] = or i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i64 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP80]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] ; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP67]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP14]], i64 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-NEXT: [[TMP82:%.*]] = phi <4 x i32> [ [[TMP76]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP81]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] ; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP70:%.*]] = or i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP70]] -; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 1 +; CHECK-NEXT: [[TMP84:%.*]] = or i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP84]] +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] ; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP73]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP14]], i64 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP87]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP11]], i64 1 +; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] ; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP76:%.*]] = or i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP76]] -; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP74]], i32 [[TMP78]], i64 2 +; CHECK-NEXT: [[TMP90:%.*]] = or i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP90]] +; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP92]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] ; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP80:%.*]] = phi <4 x i32> [ [[TMP74]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP79]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i1> [[TMP14]], i64 3 -; CHECK-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK-NEXT: [[TMP94:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP93]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: [[TMP95:%.*]] = extractelement <4 x i1> [[TMP11]], i64 2 +; CHECK-NEXT: br i1 [[TMP95]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] ; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP82:%.*]] = or i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP82]] -; CHECK-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP83]], align 4 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 3 +; CHECK-NEXT: [[TMP96:%.*]] = or i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP96]] +; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP98]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] ; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP85]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <4 x i1> [[TMP15]], i64 0 -; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK-NEXT: [[TMP100:%.*]] = phi <4 x i32> [ [[TMP94]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP99]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <4 x i1> [[TMP11]], i64 3 +; CHECK-NEXT: br i1 [[TMP101]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38]] ; CHECK: pred.load.if37: -; CHECK-NEXT: [[TMP88:%.*]] = or i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP88]] -; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> poison, i32 [[TMP90]], i64 0 +; CHECK-NEXT: [[TMP102:%.*]] = or i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP102]] +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP104]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] ; CHECK: pred.load.continue38: -; CHECK-NEXT: [[TMP92:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE36]] ], [ [[TMP91]], [[PRED_LOAD_IF37]] ] -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <4 x i1> [[TMP15]], i64 1 -; CHECK-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK: pred.load.if39: -; CHECK-NEXT: [[TMP94:%.*]] = or i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP94]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK: pred.load.continue40: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP97]], [[PRED_LOAD_IF39]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP15]], i64 2 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK: pred.load.if41: -; CHECK-NEXT: [[TMP100:%.*]] = or i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP100]] -; CHECK-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP101]], align 4 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP102]], i64 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK: pred.load.continue42: -; CHECK-NEXT: [[TMP104:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP103]], [[PRED_LOAD_IF41]] ] -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <4 x i1> [[TMP15]], i64 3 -; CHECK-NEXT: br i1 [[TMP105]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.if43: -; CHECK-NEXT: [[TMP106:%.*]] = or i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP106]] -; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP107]], align 4 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP104]], i32 [[TMP108]], i64 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.continue44: -; CHECK-NEXT: [[TMP110:%.*]] = phi <4 x i32> [ [[TMP104]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP109]], [[PRED_LOAD_IF43]] ] -; CHECK-NEXT: [[TMP111:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP38]], <4 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP111]]) -; CHECK-NEXT: [[TMP113]] = mul i32 [[TMP112]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP62]], <4 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP114]]) -; CHECK-NEXT: [[TMP116]] = mul i32 [[TMP115]], [[VEC_PHI4]] -; CHECK-NEXT: [[TMP117:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP86]], <4 x i32> -; CHECK-NEXT: [[TMP118:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP117]]) -; CHECK-NEXT: [[TMP119]] = mul i32 [[TMP118]], [[VEC_PHI5]] -; CHECK-NEXT: [[TMP120:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP110]], <4 x i32> -; CHECK-NEXT: [[TMP121:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP120]]) -; CHECK-NEXT: [[TMP122]] = mul i32 [[TMP121]], [[VEC_PHI6]] +; CHECK-NEXT: [[TMP106:%.*]] = phi <4 x i32> [ [[TMP100]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP105]], [[PRED_LOAD_IF37]] ] +; CHECK-NEXT: [[TMP107:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP34]], <4 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP107]]) +; CHECK-NEXT: [[TMP109]] = mul i32 [[TMP108]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP110:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP58]], <4 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP110]]) +; CHECK-NEXT: [[TMP112]] = mul i32 [[TMP111]], [[VEC_PHI4]] +; CHECK-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP82]], <4 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP113]]) +; CHECK-NEXT: [[TMP115]] = mul i32 [[TMP114]], [[VEC_PHI5]] +; CHECK-NEXT: [[TMP116:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP106]], <4 x i32> +; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP116]]) +; CHECK-NEXT: [[TMP118]] = mul i32 [[TMP117]], [[VEC_PHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP119:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP116]], [[TMP113]] -; CHECK-NEXT: [[BIN_RDX45:%.*]] = mul i32 [[TMP119]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX46:%.*]] = mul i32 [[TMP122]], [[BIN_RDX45]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP112]], [[TMP109]] +; CHECK-NEXT: [[BIN_RDX39:%.*]] = mul i32 [[TMP115]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX40:%.*]] = mul i32 [[TMP118]], [[BIN_RDX39]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -523,7 +517,7 @@ ; CHECK: for.inc: ; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[BIN_RDX46]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[BIN_RDX40]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RES_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -203,12 +203,10 @@ ; CHECK-VF4UF2: %[[VEC_RECUR_INIT:.*]] = insertelement poison, i32 0, i32 %[[SUB1]] ; CHECK-VF4UF2: %[[SPLAT_INS1:.*]] = insertelement poison, i32 %x, i64 0 ; CHECK-VF4UF2: %[[SPLAT1:.*]] = shufflevector %[[SPLAT_INS1]], poison, zeroinitializer -; CHECK-VF4UF2: %[[SPLAT_INS2:.*]] = insertelement poison, i32 %x, i64 0 -; CHECK-VF4UF2: %[[SPLAT2:.*]] = shufflevector %[[SPLAT_INS2]], poison, zeroinitializer ; ; CHECK-VF4UF2: vector.body ; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[ADD2:.*]], %vector.body ] ; CHECK-VF4UF2: %[[ADD1:.*]] = add %{{.*}}, %[[SPLAT1]] -; CHECK-VF4UF2: %[[ADD2]] = add %{{.*}}, %[[SPLAT2]] +; CHECK-VF4UF2: %[[ADD2]] = add %{{.*}}, %[[SPLAT1]] ; CHECK-VF4UF2: middle.block ; CHECK-VF4UF2: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF2: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -147,8 +147,6 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT10]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -163,17 +161,17 @@ ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope !7 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD9]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD9]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope !9, !noalias !11 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope !9, !noalias !11 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 4 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !9, !noalias !11 -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> , <4 x float> [[WIDE_LOAD12]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !9, !noalias !11 +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> , <4 x float> [[WIDE_LOAD10]] ; CHECK-NEXT: [[PREDPHI:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x float> , <4 x float> [[WIDE_LOAD13]] -; CHECK-NEXT: [[PREDPHI14:%.*]] = fadd <4 x float> [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x float> , <4 x float> [[WIDE_LOAD11]] +; CHECK-NEXT: [[PREDPHI12:%.*]] = fadd <4 x float> [[TMP7]], [[TMP11]] ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope !9, !noalias !11 -; CHECK-NEXT: store <4 x float> [[PREDPHI14]], ptr [[TMP9]], align 4, !alias.scope !9, !noalias !11 +; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP9]], align 4, !alias.scope !9, !noalias !11 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 [[TMP12]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -24,21 +24,15 @@ ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER9:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT9]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT11]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT13]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT10]] -; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT12]] -; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT14]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -67,8 +61,8 @@ ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]] -; CHECK: for.body.preheader15: +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] +; CHECK: for.body.preheader9: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[INDVARS_IV_PH]], -1 ; CHECK-NEXT: [[TMP19:%.*]] = add nsw i64 [[TMP18]], [[WIDE_TRIP_COUNT]] @@ -91,10 +85,10 @@ ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] ; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.body.prol.loopexit: -; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] ; CHECK-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP19]], 7 -; CHECK-NEXT: br i1 [[TMP22]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]] -; CHECK: for.body.preheader15.new: +; CHECK-NEXT: br i1 [[TMP22]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] +; CHECK: for.body.preheader9.new: ; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP25:%.*]] = fdiv fast double 1.000000e+00, [[A]] @@ -105,7 +99,7 @@ ; CHECK-NEXT: [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER15_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[TMP31:%.*]] = fmul fast double [[T0]], [[TMP23]]