diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -401,7 +401,7 @@ /// is described by \p Desc. Value *createSelectCmpTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, - Value *Src, + Value *InitialVal, Value *Src, const RecurrenceDescriptor &Desc, PHINode *OrigPhi); @@ -410,8 +410,8 @@ /// required to implement the reduction. /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi = nullptr); + const RecurrenceDescriptor &Desc, Value *Start, + Value *Src, PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1031,13 +1031,12 @@ Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, - Value *Src, + Value *InitVal, Value *Src, const RecurrenceDescriptor &Desc, PHINode *OrigPhi) { assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); - Value *InitVal = Desc.getRecurrenceStartValue(); Value *NewVal = nullptr; // First use the original phi to determine the new value we're trying to @@ -1113,8 +1112,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { + const RecurrenceDescriptor &Desc, + Value *Start, Value *Src, PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. @@ -1123,7 +1122,7 @@ RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) - return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi); + return createSelectCmpTargetReduction(B, TTI, Start, Src, Desc, OrigPhi); return createSimpleTargetReduction(B, TTI, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3932,7 +3932,8 @@ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + TrackingVH ReductionStartValue = + State.get(PhiR->getStartValue(), VPIteration(0, 0)); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); State.setDebugLocFromInst(ReductionStartValue); @@ -4053,8 +4054,8 @@ // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); + ReducedPartRdx = createTargetReduction( + Builder, TTI, RdxDesc, ReductionStartValue, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -4064,7 +4065,7 @@ } PHINode *ResumePhi = - dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); + dyn_cast_if_present(PhiR->getStartValue()->getUnderlyingValue()); // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. @@ -4081,7 +4082,7 @@ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), Incoming); else - BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); } // Set the resume value for this reduction @@ -8676,6 +8677,15 @@ Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + if (RecurrenceDescriptor::isSelectCmpRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + VPBasicBlock *VecPreheader = cast( + Plan->getVectorLoopRegion()->getSinglePredecessor()); + if (!isGuaranteedNotToBeUndefOrPoison(StartV->getUnderlyingValue())) { + StartV = new VPInstruction(Instruction::Freeze, {StartV}); + VecPreheader->appendRecipe(StartV->getDefiningRecipe()); + } + } PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc)); @@ -9601,7 +9611,9 @@ PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); - NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + NewRed = + createTargetReduction(State.Builder, TTI, *RdxDesc, + RdxDesc->getRecurrenceStartValue(), NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { NextInChain = diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -232,6 +232,12 @@ } switch (getOpcode()) { + case Instruction::Freeze: { + Value *A = State.get(getOperand(0), PartOrIter); + Value *V = Builder.CreateFreeze(A, Name); + State.set(this, V, PartOrIter); + break; + } case VPInstruction::Not: { Value *A = State.get(getOperand(0), PartOrIter); Value *V = Builder.CreateNot(A, Name); @@ -1299,7 +1305,7 @@ // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); - Value *StartV = StartVPV->getLiveInIRValue(); + Value *StartV = State.get(StartVPV, VPIteration(0, 0)); Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -62,7 +62,8 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_i32_from_icmp ; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1: [[FR:%.*]] = freeze i32 %a +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement poison, i32 %b, i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer @@ -72,11 +73,11 @@ ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector [[FIN_INS]], poison, zeroinitializer ; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne [[VEC_SEL]], [[FIN_SPLAT]] ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_CMP]]) -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 [[FR]] ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp ; CHECK-VF4IC4: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -117,7 +117,8 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) { ; CHECK-LABEL: @select_i32_from_icmp ; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 +; CHECK-VF4IC1: [[FR:%.*]] = freeze i32 %a +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -127,11 +128,11 @@ ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]] ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]]) -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 [[FR]] entry: br label %for.body @@ -214,7 +215,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-LABEL: @select_i32_from_icmp_same_inputs ; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 +; CHECK-VF4IC1: [[FR:%.*]] = freeze i32 %a +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -223,11 +225,11 @@ ; CHECK-VF4IC1: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 [[FR]], i64 0 ; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]] ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]]) -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 [[FR]] entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -8,26 +8,31 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 undef +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ undef, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5]] = select <2 x i1> [[TMP4]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], undef -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP5]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -73,26 +78,31 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 poison +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ poison, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5]] = select <2 x i1> [[TMP4]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], poison -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP5]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -138,30 +148,31 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[START]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5]] = select <2 x i1> [[TMP4]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]] +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP5]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: