diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -320,12 +320,12 @@
   getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
                            VFRange &Range);
 
-protected:
   /// Collect the instructions from the original loop that would be trivially
   /// dead in the vectorized loop if generated.
   void collectTriviallyDeadInstructions(
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 
+protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,13 +584,6 @@
   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                                VPTransformState &State);
 
-  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
-  /// means we need to add the appropriate incoming value from the middle
-  /// block as exiting edges from the scalar epilogue loop (if present) are
-  /// already in place, and we exit the vector loop exclusively to the middle
-  /// block.
-  void fixLCSSAPHIs(VPTransformState &State);
-
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
@@ -3681,8 +3674,6 @@
                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                    IVEndValues[Entry.first], LoopMiddleBlock,
                    VectorLoop->getHeader());
-
-    fixLCSSAPHIs(State);
   }
 
   for (Instruction *PI : PredicatedInstructions)
@@ -4051,35 +4042,6 @@
   }
 }
 
-void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
-  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
-    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
-      // Some phis were already hand updated by the reduction and recurrence
-      // code above, leave them alone.
-      continue;
-
-    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
-    // Non-instruction incoming values will have only one value.
-
-    VPLane Lane = VPLane::getFirstLane();
-    if (isa<Instruction>(IncomingValue) &&
-        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
-                                           VF))
-      Lane = VPLane::getLastLaneForVF(VF);
-
-    // Can be a loop invariant incoming value or the last scalar value to be
-    // extracted from the vectorized loop.
-    // FIXME: Should not rely on getVPValue at this point.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
-    Value *lastIncomingValue =
-        OrigLoop->isLoopInvariant(IncomingValue)
-            ? IncomingValue
-            : State.get(State.Plan->getVPValue(IncomingValue, true),
-                        VPIteration(UF - 1, Lane));
-    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
-  }
-}
-
 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
   // The basic block and loop containing the predicated instruction.
   auto *PredBB = PredInst->getParent();
@@ -8694,6 +8656,78 @@
   EB->appendRecipe(BranchOnCount);
 }
 
+// Add exit values to \p Plan.
+static void
+addUsersInExitBlock(VPBasicBlock *HeaderVPBB, VPBasicBlock *MiddleVPBB,
+                    Loop *OrigLoop, VPBuilder &Builder, VPlan &Plan,
+                    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+  // First, collect VPValues for which the exit user is not modeled yet in
+  // VPlan. Those include:
+  //  1. First-order recurrences,
+  //  2. Integer, fp & pointer inductions and their increments,
+  //  3. Reduction chains.
+  SmallPtrSet<VPValue *, 8> SkipExitValues;
+  for (VPRecipeBase &R : HeaderVPBB->phis()) {
+    PHINode *Phi = nullptr;
+    if (auto *Recur = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
+      SkipExitValues.insert(Recur);
+    } else if (auto *Ind = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+      if (Ind->getTruncInst())
+        continue;
+      Phi = Ind->getPHINode();
+    } else if (isa<VPWidenPointerInductionRecipe>(&R))
+      Phi = cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue());
+    if (Phi) {
+      SkipExitValues.insert(R.getVPSingleValue());
+      auto *PostInc = Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+
+      if (DeadInstructions.count(cast<Instruction>(PostInc)))
+        continue;
+      SkipExitValues.insert(Plan.getVPValue(PostInc));
+    }
+
+    if (!isa<VPReductionPHIRecipe>(&R))
+      continue;
+
+    SmallVector<VPValue *> WorkList;
+    SmallPtrSet<VPValue *, 8> Visited;
+    WorkList.push_back(R.getVPSingleValue());
+
+    while (!WorkList.empty()) {
+      VPValue *V = WorkList.pop_back_val();
+
+      for (VPUser *U : V->users()) {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        auto *R = dyn_cast<VPRecipeBase>(U);
+        if (!VPI) {
+          if (R && Visited.insert(R->getVPSingleValue()).second)
+            WorkList.push_back(R->getVPSingleValue());
+          continue;
+        }
+      }
+      SkipExitValues.insert(V);
+    }
+  }
+
+  // Introduce recipes modeling the exit values. They represent LCSSA phis and
+  // must be added to the beginning of the exit/middle block.
+  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
+  if (!ExitBB)
+    return;
+  // Only handle single-exit loops for now.
+  if (!ExitBB->getSinglePredecessor())
+    return;
+  for (PHINode &ExitPhi : ExitBB->phis()) {
+    Value *IncomingValue =
+        ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch());
+    VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
+    if (SkipExitValues.count(V))
+      continue;
+    Builder.createNaryOp(VPInstruction::ScalarExitValue, {V}, &ExitPhi);
+  }
+}
+
 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
     const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -8866,6 +8900,9 @@
   // After here, VPBB should not be used.
   VPBB = nullptr;
 
+  addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, Builder, *Plan,
+                      DeadInstructions);
+
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
          "entry block must be set to a VPRegionBlock having a non-empty entry "
@@ -10596,13 +10633,17 @@
                                                  Checks);
 
         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
+        for (VPRecipeBase &R : make_early_inc_range(
+                 *cast<VPBasicBlock>(VectorLoop->getSingleSuccessor()))) {
+          R.eraseFromParent();
+        }
         BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->setName(
            "vec.epilog.vector.body");
 
        // Ensure that the start values for any VPReductionPHIRecipes are
        // updated before vectorising the epilogue loop.
-        VPBasicBlock *Header =
-            BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock();
+        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
         for (VPRecipeBase &R : Header->phis()) {
           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
             if (auto *Resume = MainILV.getReductionResumeValue(
@@ -10613,6 +10654,22 @@
           }
         }
 
+        // Update exit values for LCSSA phis in the exit block for the epilogue
+        // loop.
+        VPBasicBlock *MiddleVPBB =
+            cast<VPBasicBlock>(VectorLoop->getSingleSuccessor());
+        for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
+          auto *VPI = dyn_cast<VPInstruction>(&R);
+          if (!VPI || VPI->getOpcode() != VPInstruction::ScalarExitValue)
+            continue;
+          VPI->eraseFromParent();
+        }
+        VPBuilder B;
+        SmallPtrSet<Instruction *, 4> DeadInstructions;
+        LVP.collectTriviallyDeadInstructions(DeadInstructions);
+        addUsersInExitBlock(Header, MiddleVPBB, L, B, BestEpiPlan,
+                            DeadInstructions);
+
         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                         DT);
         ++LoopsEpilogueVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -814,6 +814,7 @@
     CanonicalIVIncrement,
     CanonicalIVIncrementNUW,
     BranchOnCount,
+    ScalarExitValue,
   };
 
 private:
@@ -929,6 +930,19 @@
     };
     llvm_unreachable("switch should return");
   }
+
+  /// Returns true if the recipe uses scalars of operand \p Op.
+  bool usesScalars(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    switch (getOpcode()) {
+    default:
+      return onlyFirstLaneUsed(Op);
+    case VPInstruction::ScalarExitValue:
+      return true;
+    };
+    llvm_unreachable("switch should return");
+  }
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -2610,7 +2624,7 @@
     assert(V && "Trying to get or add the VPValue of a null Value");
     if (!Value2VPValue.count(V))
       addVPValue(V);
-    return getVPValue(V);
+    return getVPValue(V, OverrideAllowed);
   }
 
   void removeVPValueFor(Value *V) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -332,6 +332,7 @@
     // ExitBB can be re-used for the exit block of the Plan.
     NewBB = State->CFG.ExitBB;
     State->CFG.PrevBB = NewBB;
+    State->Builder.SetInsertPoint(NewBB, NewBB->begin());
   } else if (PrevVPBB && /* A */
              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
                SingleHPred->getExitBasicBlock() == PrevVPBB &&
@@ -810,6 +811,18 @@
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     break;
   }
+  case VPInstruction::ScalarExitValue: {
+    if (Part != 0)
+      break;
+    PHINode *ExitPhi = cast<PHINode>(getUnderlyingValue());
+    auto Lane = VPLane::getLastLaneForVF(State.VF);
+    if (getParent()->getPlan()->isUniformAfterVectorization(getOperand(0)))
+      Lane = VPLane::getFirstLane();
+    ExitPhi->addIncoming(
+        State.get(getOperand(0), VPIteration(State.UF - 1, Lane)),
+        State.CFG.VPBB2IRBB[getParent()]);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -866,6 +879,9 @@
   case VPInstruction::BranchOnCount:
     O << "branch-on-count ";
     break;
+  case VPInstruction::ScalarExitValue:
+    O << "scalar-exit-value ";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -52,11 +52,11 @@
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 [[TMP22]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IDX]], [[L_ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -41,8 +41,8 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8
@@ -68,8 +68,8 @@
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
 ; CHECK-NEXT:    [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7
+; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
 ; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
@@ -0,0 +1,71 @@
+; RUN: opt -passes=loop-vectorize -mtriple=x86_64-unknown-linux -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+define i32* @test(i32* noalias %src, i32* noalias %dst) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP11]], i32 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> zeroinitializer, <2 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label %vector.body
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
+; CHECK-NEXT:    br i1 [[CMP_N]], label %exit, label %scalar.ph
+; CHECK:       exit:
+; CHECK-NEXT:    [[GEP_LCSSA:%.*]] = phi i32* [ %gep.src, %loop.latch ], [ [[TMP2]], %middle.block ]
+; CHECK-NEXT:    ret i32* [[GEP_LCSSA]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+  %cmp.1 = icmp eq i64 %iv, 0
+  br i1 %cmp.1, label %loop.latch, label %then
+
+then:
+  %l = load i32, i32* %gep.src, align 4
+  br label %loop.latch
+
+loop.latch:
+  %m = phi i32 [ %l, %then ], [ 0, %loop.header ]
+  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
+  store i32 %m, i32* %gep.dst, align 4
+  %iv.next = add nsw i64 %iv, 1
+  %cmp.2 = icmp slt i64 %iv.next, 1000
+  br i1 %cmp.2, label %loop.header, label %exit
+
+exit:
+  %gep.lcssa = phi i32* [ %gep.src, %loop.latch ]
+  ret i32* %gep.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
--- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
+++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
@@ -22,8 +22,8 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -85,8 +85,8 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i64 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -20,8 +20,8 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -80,8 +80,8 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -154,8 +154,8 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 0, 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -218,8 +218,8 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -42,8 +42,8 @@
 ; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; VF-TWO-CHECK:       middle.block:
-; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.iter.check:
 ; VF-TWO-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
@@ -70,8 +70,8 @@
 ; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; VF-TWO-CHECK:       vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; VF-TWO-CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK:       vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]