diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8183,18 +8183,6 @@
            CM.isProfitableToScalarize(I, VF);
   };
 
-  bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) {
-        // Returns true if we should generate a scalar version of \p IV.
-        if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
-          return true;
-        auto isScalarInst = [&](User *U) -> bool {
-          auto *I = cast<Instruction>(U);
-          return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
-        };
-        return any_of(PhiOrTrunc->users(), isScalarInst);
-      },
-      Range);
   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) {
         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
@@ -8209,11 +8197,11 @@
       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
-                                             NeedsScalarIV, !NeedsScalarIVOnly);
+                                             !NeedsScalarIVOnly);
   }
   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
-                                           NeedsScalarIV, !NeedsScalarIVOnly);
+                                           !NeedsScalarIVOnly);
 }
 
 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1064,24 +1064,22 @@
 class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
   PHINode *IV;
   const InductionDescriptor &IndDesc;
-  bool NeedsScalarIV;
   bool NeedsVectorIV;
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
                                 const InductionDescriptor &IndDesc,
-                                bool NeedsScalarIV, bool NeedsVectorIV)
+                                bool NeedsVectorIV)
       : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
         VPValue(IV, this), IV(IV), IndDesc(IndDesc),
-        NeedsScalarIV(NeedsScalarIV), NeedsVectorIV(NeedsVectorIV) {}
+        NeedsVectorIV(NeedsVectorIV) {}
 
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
                                 const InductionDescriptor &IndDesc,
-                                TruncInst *Trunc, bool NeedsScalarIV,
-                                bool NeedsVectorIV)
+                                TruncInst *Trunc, bool NeedsVectorIV)
       : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
         VPValue(Trunc, this), IV(IV), IndDesc(IndDesc),
-        NeedsScalarIV(NeedsScalarIV), NeedsVectorIV(NeedsVectorIV) {}
+        NeedsVectorIV(NeedsVectorIV) {}
 
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
@@ -1132,9 +1130,6 @@
     return TruncI ? TruncI->getType() : IV->getType();
  }
 
-  /// Returns true if a scalar phi needs to be created for the induction.
-  bool needsScalarIV() const { return NeedsScalarIV; }
-
   /// Returns true if a vector phi needs to be created for the induction.
   bool needsVectorIV() const { return NeedsVectorIV; }
 };
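The removed `NeedsScalarIV` flag cached, per VF range, whether any in-loop user of the induction requires scalar values. That fact is now queried directly from the VPlan, where scalar uses are explicit on the recipes (see the `usesScalars` check added in VPlanTransforms.cpp below). A minimal sketch of the equivalent query, assuming the interfaces named in this patch; `hasScalarUsers` is a hypothetical helper, not part of the change:

```cpp
// Sketch only: the positive form of the all_of check added in
// VPlanTransforms.cpp. True if some recipe using the widened induction
// consumes per-lane scalar values, i.e. what NeedsScalarIV pre-computed.
static bool hasScalarUsers(VPWidenIntOrFpInductionRecipe *IV) {
  return any_of(IV->users(), [IV](VPUser *U) {
    return cast<VPRecipeBase>(U)->usesScalars(IV);
  });
}
```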
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -51,8 +51,8 @@
         VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
         VPValue *Step =
             vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
-        NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II,
-                                                      false, true);
+        NewRecipe =
+            new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true);
       } else {
         Plan->addVPValue(Phi, VPPhi);
         continue;
@@ -417,7 +417,9 @@
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
     auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
-    if (!IV || !IV->needsScalarIV())
+    if (!IV || all_of(IV->users(), [IV](VPUser *U) {
+          return !cast<VPRecipeBase>(U)->usesScalars(IV);
+        }))
       continue;
 
     const InductionDescriptor &ID = IV->getInductionDescriptor();
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -246,46 +246,42 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND4:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD5:%.*]] = add <4 x i32> [[VEC_IND4]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[STEP_ADD6:%.*]] = add <4 x i32> [[STEP_ADD5]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[STEP_ADD7:%.*]] = add <4 x i32> [[STEP_ADD6]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 5
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP0]], 6
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], 7
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP0]], 9
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], 10
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP0]], 11
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP0]], 12
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP0]], 13
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP0]], 14
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP0]], 15
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[ADDR]], align 4
 ; CHECK-NEXT:    store i32 [[TMP5]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP6]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP7]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP8]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP9]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP10]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP11]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP12]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP13]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP14]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP15]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 1
-; CHECK-NEXT:    store i32 [[TMP17]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 2
-; CHECK-NEXT:    store i32 [[TMP18]], i32* [[ADDR]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 3
-; CHECK-NEXT:    store i32 [[TMP19]], i32* [[ADDR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT9]] = add <4 x i32> [[STEP_ADD7]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4097, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
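The test update above shows the intended effect: since no recipe here requires a vector IV, the `<4 x i32>` induction phi, its `step.add` chain, and the per-lane `extractelement`s disappear, and each lane of the truncated IV is computed as `trunc(index) + lane` instead. A hedged sketch of that per-lane computation, with hypothetical `Builder`/`Index` names (this is the shape of the emitted IR, not the vectorizer's actual helper):

```cpp
#include "llvm/IR/IRBuilder.h"

// Sketch only: lane `Lane` of a truncated scalar IV with start 0 and step 1,
// matching the new CHECK lines (trunc i64 %index to i32; add i32 ..., Lane).
llvm::Value *emitLaneValue(llvm::IRBuilder<> &Builder, llvm::Value *Index,
                           llvm::Type *TruncTy, unsigned Lane) {
  llvm::Value *Base = Builder.CreateTrunc(Index, TruncTy);
  return Builder.CreateAdd(Base, llvm::ConstantInt::get(TruncTy, Lane));
}
```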
diff --git a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
--- a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
@@ -11,19 +11,18 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* [[PTR:%.*]], i64 0, i64 [[TMP0]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* [[PTR]], i64 0, i64 [[TMP1]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* [[PTR:%.*]], i64 0, i64 [[TMP3]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* [[PTR]], i64 0, i64 [[TMP4]], i64 [[TMP4]]
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -38,30 +38,6 @@
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 32, 32
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT:    br label [[LOOP_COND:%.*]]
-; CHECK:       loop.cond:
-; CHECK-NEXT:    [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT:    [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT:    [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]]
-; CHECK-NEXT:    br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK:       loop.next:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[RES:%.*]] = phi i16 [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ]
-; CHECK-NEXT:    [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[IV]]
-; CHECK-NEXT:    store i16 [[RES]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT:    br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.header
@@ -241,15 +217,15 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i16 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i16, i16* [[TMP5]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0
@@ -257,31 +233,30 @@
 ; CHECK:       pred.load.continue:
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i16 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[TMP11]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP19]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[PREDPHI3:%.*]] = select <2 x i1> [[TMP19]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP20]], i32 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[TMP21]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> [[PREDPHI5]], <2 x i16>* [[TMP22]], align 2
+; CHECK-NEXT:    store <2 x i16> [[PREDPHI3]], <2 x i16>* [[TMP22]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], <i16 2, i16 2>
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -327,15 +302,15 @@
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
--- a/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
@@ -110,8 +110,12 @@
 
 ; VF8-LABEL: @doit2
 ; VF8: vector.body:
-; VF8: %vec.ind = phi <8 x i64>
-; VF8: %{{.*}} = extractelement <8 x i64> %vec.ind
+; VF8-NEXT: [[INDEX:%.+]] = phi i64 [ 0, %vector.ph ]
+; VF8-NEXT: [[I0:%.+]] = add i64 [[INDEX]], 0
+; VF8-NEXT: [[OFFSET_IDX:%.+]] = mul i64 [[INDEX]], %step
+; VF8-NEXT: [[MUL0:%.+]] = mul i64 0, %step
+; VF8-NEXT: [[ADD:%.+]] = add i64 [[OFFSET_IDX]], [[MUL0]]
+; VF8: getelementptr inbounds i32, i32* %in, i64 [[ADD]]
 ; VF8: middle.block:
 
 ; VF1-LABEL: @doit2
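For `@doit2` the induction advances by a runtime value, so the scalar expansion multiplies rather than just adds: the new checks expect `%offset.idx = %index * %step`, with lane I adding `I * %step` on top. A hedged sketch of that lane computation, reusing the same hypothetical `Builder`/`Index` names as the sketch above:

```cpp
// Sketch only: lane I of a scalar IV with runtime stride Step, matching the
// VF8 CHECK lines (mul i64 %index, %step; mul i64 I, %step; add of both).
llvm::Value *emitStridedLane(llvm::IRBuilder<> &Builder, llvm::Value *Index,
                             llvm::Value *Step, unsigned I) {
  llvm::Value *OffsetIdx = Builder.CreateMul(Index, Step);
  llvm::Value *LaneStep =
      Builder.CreateMul(llvm::ConstantInt::get(Step->getType(), I), Step);
  return Builder.CreateAdd(OffsetIdx, LaneStep);
}
```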