diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -567,7 +567,8 @@ /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, - BasicBlock *MiddleBlock, BasicBlock *VectorHeader); + BasicBlock *MiddleBlock, BasicBlock *VectorHeader, + VPlan &Plan); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -584,13 +585,6 @@ void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, VPTransformState &State); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. - void fixLCSSAPHIs(VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -3335,7 +3329,7 @@ const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, - BasicBlock *VectorHeader) { + BasicBlock *VectorHeader, VPlan &Plan) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3397,6 +3391,7 @@ // don't already have an incoming value for the middle block. if (PHI->getBasicBlockIndex(MiddleBlock) == -1) PHI->addIncoming(I.second, MiddleBlock); + Plan.removeLiveOut(PHI); } } @@ -3709,11 +3704,18 @@ fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader()); - - fixLCSSAPHIs(State); + VectorLoop->getHeader(), Plan); + } else { + // Otherwise all exit values should be forwarded to the scalar loop earlier. + Plan.clearLiveOuts(); } + // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated + // in the exit block, so update the builder. + State.Builder.SetInsertPoint(State.CFG.ExitBB, State.CFG.ExitBB->begin()); + for (auto &KV : Plan.getLiveOuts()) + KV.second->fixPhi(Plan, State); + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -3862,8 +3864,10 @@ // and thus no phis which needed updated. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -3921,7 +3925,7 @@ // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, // and so use the select value for the phi instead of the old - // LoopExitValue. + // LoopLiveOut. if (PreferPredicatedReductionSelect || TTI->preferPredicatedReductionSelect( RdxDesc.getOpcode(), PhiTy, @@ -4046,8 +4050,10 @@ // fixFirstOrderRecurrence for a more complete explaination of the logic. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. @@ -4091,35 +4097,6 @@ } } -void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - - VPLane Lane = VPLane::getFirstLane(); - if (isa<Instruction>(IncomingValue) && - !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), - VF)) - Lane = VPLane::getLastLaneForVF(VF); - - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - // FIXME: Should not rely on getVPValue at this point. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - OrigLoop->isLoopInvariant(IncomingValue) - ? IncomingValue - : State.get(State.Plan->getVPValue(IncomingValue, true), - VPIteration(UF - 1, Lane)); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); - } -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -8729,6 +8706,26 @@ EB->appendRecipe(BranchOnCount); } +// Add exit values to \p Plan. +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, + VPBasicBlock *MiddleVPBB, Loop *OrigLoop, + VPlan &Plan) { + BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); + if (!ExitBB) + return; + // Only handle single-exit loops for now. + if (!ExitBB->getSinglePredecessor()) + return; + + // Introduce VPUsers modeling the exit values. + for (PHINode &ExitPhi : ExitBB->phis()) { + Value *IncomingValue = + ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch()); + VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + Plan.addLiveOut(&ExitPhi, V); + } +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -8908,6 +8905,8 @@ // After here, VPBB should not be used. VPBB = nullptr; + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " @@ -9165,7 +9164,7 @@ continue; // ReductionOperations are orders top-down from the phi's use to the - // LoopExitValue. We keep a track of the previous item (the Chain) to tell + // LoopLiveOut. We keep a track of the previous item (the Chain) to tell // which of the two operands will remain scalar and which will be reduced. // For minmax the chain will be the select instructions. Instruction *Chain = Phi; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -664,6 +664,27 @@ #endif }; +/// A value that is used outside the VPlan. The operand of the user needs to be +/// added to the associated LCSSA phi node. +class VPLiveOut : public VPUser { + PHINode *Phi; + +public: + VPLiveOut(PHINode *Phi, VPValue *Op) + : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + + void fixPhi(VPlan &Plan, VPTransformState &State); + + /// Returns true if the VPLiveOut uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + PHINode *getPhi() const { return Phi; } +}; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -2482,6 +2503,9 @@ /// mapping cannot be used any longer, because it is stale. bool Value2VPValueEnabled = true; + /// Values used outside the plan. + DenseMap<PHINode *, VPLiveOut *> LiveOuts; + public: VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { if (Entry) @@ -2489,6 +2513,8 @@ } ~VPlan() { + clearLiveOuts(); + if (Entry) { VPValue DummyValue; for (VPBlockBase *Block : depth_first(Entry)) @@ -2657,6 +2683,23 @@ return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + void addLiveOut(PHINode *PN, VPValue *V); + + void clearLiveOuts() { + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); + } + + void removeLiveOut(PHINode *PN) { + delete LiveOuts[PN]; + LiveOuts.erase(PN); + } + + const DenseMap<PHINode *, VPLiveOut *> &getLiveOuts() const { + return LiveOuts; + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -647,6 +647,16 @@ } } +void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { + auto Lane = VPLane::getLastLaneForVF(State.VF); + if (Plan.isUniformAfterVectorization(getOperand(0))) + Lane = VPLane::getFirstLane(); + Phi->addIncoming(State.get(getOperand(0), VPIteration(State.UF - 1, Lane)), + State.CFG.VPBB2IRBB[Plan.getVectorLoopRegion() + ->getSingleSuccessor() + ->getExitBasicBlock()]); +} + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -1065,6 +1075,17 @@ O << '\n'; Block->print(O, "", SlotTracker); } + + if (!LiveOuts.empty()) + O << "\n"; + for (auto &KV : LiveOuts) { + O << "Live-out "; + KV.second->getPhi()->printAsOperand(O); + O << " = "; + KV.second->getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; + } + O << "}\n"; } @@ -1078,6 +1099,12 @@ void VPlan::dump() const { print(dbgs()); } #endif +void VPlan::addLiveOut(PHINode *PN, VPValue *V) { + assert(LiveOuts.find(PN) == LiveOuts.end() && + "an exit value for PN already exists"); + LiveOuts.insert({PN, new VPLiveOut(PN, V)}); +} + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB, BasicBlock *LoopLatchBB, BasicBlock *LoopExitBB) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -208,6 +208,7 @@ /// Subclass identifier (for isa/dyn_cast). enum class VPUserID { Recipe, + LiveOut, // TODO: Currently VPUsers are used in VPBlockBase, but in the future the // only VPUsers should either be recipes or live-outs. Block diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -202,5 +202,11 @@ return false; } } + + for (auto &KV : Plan.getLiveOuts()) + if (KV.second->getNumOperands() != 1) { + errs() << "live outs must have a single operand\n"; + return false; + } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -52,11 +52,11 @@ ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 [[TMP22]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IDX]], [[L_ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -41,8 +41,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1> -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8 @@ -68,8 +68,8 @@ ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT16]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER19:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT16]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) ; CHECK-NEXT: [[PREDPHI20:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER19]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1> -; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC11]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI20]], i64 7 +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC11]] ; CHECK-NEXT: br i1 [[CMP_N12]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -0,0 +1,71 @@ +; RUN: opt -passes=loop-vectorize -mtriple=x86_64-unknown-linux -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define i32* @test(i32* noalias %src, i32* noalias %dst) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> zeroinitializer, <2 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP16]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2> +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: br i1 [[CMP_N]], label %exit, label %scalar.ph +; CHECK: exit: +; CHECK-NEXT: [[GEP_LCSSA:%.*]] = phi i32* [ %gep.src, %loop.latch ], [ [[TMP2]], %middle.block ] +; CHECK-NEXT: ret i32* [[GEP_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv + %cmp.1 = icmp eq i64 %iv, 0 + br i1 %cmp.1, label %loop.latch, label %then + +then: + %l = load i32, i32* %gep.src, align 4 + br label %loop.latch + +loop.latch: + %m = phi i32 [ %l, %then ], [ 0, %loop.header ] + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv + store i32 %m, i32* %gep.dst, align 4 + %iv.next = add nsw i64 %iv, 1 + %cmp.2 = icmp slt i64 %iv.next, 1000 + br i1 %cmp.2, label %loop.header, label %exit + +exit: + %gep.lcssa = phi i32* [ %gep.src, %loop.latch ] + ret i32* %gep.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll --- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -22,8 +22,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -85,8 +85,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i64 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -225,6 +225,8 @@ ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out i32 %res = ir<%and.red.next> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -80,8 +80,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -154,8 +154,8 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -218,8 +218,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -206,7 +206,7 @@ ; CHECK-LABEL: vector.body: ; CHECK-NEXT: [[INDEX:%.+]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.+]], %vector.body ] ; VEC-NEXT: [[VEC_IND:%.+]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.+]], %vector.body ] -; CHECK-NEXT: [[IV_0:%.+]] = add i64 [[INDEX]], 0 +; CHECK: [[IV_0:%.+]] = add i64 [[INDEX]], 0 ; VEC-NOT: add i64 [[INDEX]], 1 ; CHECK-NOT: [[IV_2_0:%.+]] = add i32 %offset.idx, 0 ; CHECK-LABEL: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -42,8 +42,8 @@ ; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF-TWO-CHECK: middle.block: -; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-TWO-CHECK: vec.epilog.iter.check: ; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] @@ -70,8 +70,8 @@ ; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: -; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -127,6 +127,8 @@ ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red.next.lcssa = ir<%red.next> ; CHECK-NEXT: } ; entry: @@ -363,6 +365,8 @@ ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %muladd.lcssa = ir<%muladd> ; CHECK-NEXT:} entry: @@ -552,6 +556,8 @@ ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out i32 %lcssa = ir<%add> ; CHECK-NEXT: } ; entry: